package com.yaoandw.crawler;

import java.io.BufferedReader;
import java.io.StringReader;
import java.nio.charset.StandardCharsets;
import java.util.regex.Pattern;

import com.yaoandw.util.StringUtil;

/**
 * Noise removal for crawled HTML: drops non-content blocks and keeps only the
 * lines whose text density (visible text bytes / remaining line bytes) is
 * above {@link #plainRate}.
 *
 * @author yaowei
 */
public class PlainTextTool {
	/**
	 * Text-density threshold: a line is kept only when its density is strictly
	 * greater than this value. Left public and mutable so existing callers that
	 * adjust it keep working. The original note next to the value reads
	 * "sina 163" (presumably the sites it was tuned on; confirm).
	 */
	public static double plainRate = 0.43;

	/**
	 * Whole-document blocks removed before line scoring, applied in this order:
	 * head, style, script, noscript, object, HTML comments.
	 */
	private static final Pattern[] NOISE_BLOCKS = {
		Pattern.compile("<((?i)head)[.\\s\\S]*?</((?i)head)>"),
		Pattern.compile("<((?i)style)[.\\s\\S]*?</((?i)style)>"),
		Pattern.compile("<((?i)script)[.\\s\\S]*?</((?i)script)>"),
		Pattern.compile("<((?i)noscript)[.\\s\\S]*?</((?i)noscript)>"),
		Pattern.compile("<((?i)object)[.\\s\\S]*?</((?i)object)>"),
		Pattern.compile("<!--[.\\s\\S]*?-->")
	};

	/** Same line terminators that {@code BufferedReader.readLine()} recognises. */
	private static final Pattern LINE_BREAK = Pattern.compile("\r\n|\r|\n");

	/** Any tag, including one whose '<' and '>' ended up on different lines. */
	private static final Pattern MULTILINE_TAG = Pattern.compile("<[.\\s\\S]*?>");

	/**
	 * Tags removed before density is measured. The character class skips every
	 * tag whose first character after '<' is '/', '?' or 'a', so besides
	 * {@code <a>} and {@code </a>} it also leaves all closing tags and tags such
	 * as {@code <abbr>} in the line, where they count as markup.
	 * NOTE(review): the original comment only mentions sparing link tags; the
	 * pattern is kept as-is because the threshold is tuned against it.
	 */
	private static final Pattern NON_LINK_TAG = Pattern.compile("<[^/?a][^>]*>");

	/** Any tag that opens and closes on the same line. */
	private static final Pattern INLINE_TAG = Pattern.compile("<[^>]*?>");

	/**
	 * Extracts the text-dense lines of an HTML document.
	 *
	 * @param script raw HTML; input that {@code StringUtil.isEmpty} reports as
	 *               empty yields an empty result
	 * @return the kept lines, each followed by {@code "\n"}, with any remaining
	 *         tags removed; never {@code null}
	 */
	public static String getPlainText(String script){
		StringBuilder result = new StringBuilder();
		if(!StringUtil.isEmpty(script)){
			String html = script;
			for (Pattern noise : NOISE_BLOCKS) {
				html = noise.matcher(html).replaceAll("");
			}
			for (String line : LINE_BREAK.split(html)) {
				String lineResult = processLine(line);
				if(lineResult != null && !"".equals(lineResult.trim())){
					result.append(lineResult).append("\n");
				}
			}
		}
		// Remove HTML tags that were split across line breaks and therefore
		// survived the per-line stripping.
		return MULTILINE_TAG.matcher(result.toString()).replaceAll("");
	}

	/**
	 * Placeholder for title extraction; not implemented.
	 *
	 * @param script raw HTML (currently ignored)
	 * @return always the empty string
	 */
	public static String getTitleText(String script){
		return "";
	}

	/**
	 * Scores one line and returns its plain text when it is dense enough.
	 * Density is measured in UTF-8 bytes so the result does not depend on the
	 * JVM's default charset.
	 *
	 * @param st a single line of HTML, may be {@code null}
	 * @return the line with all same-line tags removed when its density exceeds
	 *         {@link #plainRate}; {@code null} for a {@code null} line, a line
	 *         that is empty after tag removal, or a line that is not dense enough
	 */
	private static String processLine(String st) {
		if(st == null){
			return null;
		}
		// Drop tags matched by NON_LINK_TAG; what is left is the denominator.
		String remaining = NON_LINK_TAG.matcher(st).replaceAll("");
		int fullLength = remaining.getBytes(StandardCharsets.UTF_8).length;
		if(fullLength == 0){
			return null;
		}
		// Visible text only: the numerator.
		String plain = INLINE_TAG.matcher(remaining).replaceAll("");
		int plainLength = plain.getBytes(StandardCharsets.UTF_8).length;
		double percent = (double)plainLength/(double)fullLength;
		return percent > plainRate ? plain : null;
	}

	/**
	 * Manual smoke run: fetches one page through {@code HtmlParserTool} and
	 * prints the extracted plain text.
	 */
	public static void main(String[] args){
		// Alternative sample pages kept from the original source:
		//   http://www.actiz.com:8080/bbs/forum.php?mod=viewthread&tid=64&extra=page%3D1
		//   http://news.163.com/12/0822/10/89GM586F00014JB5.html
		String url = "http://sports.sina.com.cn/g/pl/2012-08-21/03516196236.shtml";
		ExtracPageInfo obj = HtmlParserTool.extracLinksAndText(url, null);
		String text = obj.getText();
		System.out.println("============================================================");

		String plainText = PlainTextTool.getPlainText(text);
		System.out.println("============================================================");
		System.out.println(plainText);
	}
}
